library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(survival)
library(ggsurvfit)
library(gtsummary)
library(glue)
library(labelled)
library(rpart.plot)
## Loading required package: rpart
library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## 
## Attaching package: 'party'
## 
## The following object is masked from 'package:gtsummary':
## 
##     where
## 
## The following object is masked from 'package:dplyr':
## 
##     where
library(partykit)
## Loading required package: libcoin
## 
## Attaching package: 'partykit'
## 
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
##     node_terminal, varimp
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:survival':
## 
##     cluster
## 
## The following object is masked from 'package:purrr':
## 
##     lift
load("dat1.RData")
load("dat2.RData")
#mutating factor variables and creating labels for baseline characteristic table
table_data = dat1 |>
  mutate(gender = factor(gender, levels = c(0,1), labels = c("Female","Male")),
         race = factor(race, levels = c(1,2,3,4), labels = c("White","Asian","Black","Hispanic")),
         smoking = factor(smoking, levels = c(0,1,2), labels = c("Never smoked", "Former smoker", "Current smoker")),
         diabetes = factor(diabetes, levels = c(0,1), labels = c("No","Yes")),
         hypertension = factor(hypertension, levels = c(0,1), labels = c("No", "Yes")))

#creating table labels
var_label(table_data) = list(
  age = "Age (in years)",
  gender = "Gender",
  race = "Race/ethnicity",
  smoking = "Smoking status",
  height = "Height (in centimeters)",
  weight = "Weight (in kilograms)",
  bmi = "BMI (body mass index)",
  diabetes = "Diabetes",
  hypertension = "Hypertension",
  SBP = "Systolic Blood Pressure (mmHg)",
  LDL = "LDL Cholesterol (mg/dL)",
  time = "Time since vaccination (in days)"
)
#outputting table
table = table_data |>
  select(age, gender, race, smoking, height, weight, bmi, diabetes, hypertension, SBP, LDL, time) |>
  tbl_summary(
    missing_text = "(Missing)",
    statistic = list(all_continuous() ~ "{median} ± {sd}")
  ) 

print(table)
## <div id="eaasytoupi" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
##   <style>#eaasytoupi table {
##   font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
##   -webkit-font-smoothing: antialiased;
##   -moz-osx-font-smoothing: grayscale;
## }
## 
## #eaasytoupi thead, #eaasytoupi tbody, #eaasytoupi tfoot, #eaasytoupi tr, #eaasytoupi td, #eaasytoupi th {
##   border-style: none;
## }
## 
## #eaasytoupi p {
##   margin: 0;
##   padding: 0;
## }
## 
## #eaasytoupi .gt_table {
##   display: table;
##   border-collapse: collapse;
##   line-height: normal;
##   margin-left: auto;
##   margin-right: auto;
##   color: #333333;
##   font-size: 16px;
##   font-weight: normal;
##   font-style: normal;
##   background-color: #FFFFFF;
##   width: auto;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #A8A8A8;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #A8A8A8;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_caption {
##   padding-top: 4px;
##   padding-bottom: 4px;
## }
## 
## #eaasytoupi .gt_title {
##   color: #333333;
##   font-size: 125%;
##   font-weight: initial;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-color: #FFFFFF;
##   border-bottom-width: 0;
## }
## 
## #eaasytoupi .gt_subtitle {
##   color: #333333;
##   font-size: 85%;
##   font-weight: initial;
##   padding-top: 3px;
##   padding-bottom: 5px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-top-color: #FFFFFF;
##   border-top-width: 0;
## }
## 
## #eaasytoupi .gt_heading {
##   background-color: #FFFFFF;
##   text-align: center;
##   border-bottom-color: #FFFFFF;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_bottom_border {
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_col_headings {
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_col_heading {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: normal;
##   text-transform: inherit;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: bottom;
##   padding-top: 5px;
##   padding-bottom: 6px;
##   padding-left: 5px;
##   padding-right: 5px;
##   overflow-x: hidden;
## }
## 
## #eaasytoupi .gt_column_spanner_outer {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: normal;
##   text-transform: inherit;
##   padding-top: 0;
##   padding-bottom: 0;
##   padding-left: 4px;
##   padding-right: 4px;
## }
## 
## #eaasytoupi .gt_column_spanner_outer:first-child {
##   padding-left: 0;
## }
## 
## #eaasytoupi .gt_column_spanner_outer:last-child {
##   padding-right: 0;
## }
## 
## #eaasytoupi .gt_column_spanner {
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   vertical-align: bottom;
##   padding-top: 5px;
##   padding-bottom: 5px;
##   overflow-x: hidden;
##   display: inline-block;
##   width: 100%;
## }
## 
## #eaasytoupi .gt_spanner_row {
##   border-bottom-style: hidden;
## }
## 
## #eaasytoupi .gt_group_heading {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: middle;
##   text-align: left;
## }
## 
## #eaasytoupi .gt_empty_group_heading {
##   padding: 0.5px;
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   vertical-align: middle;
## }
## 
## #eaasytoupi .gt_from_md > :first-child {
##   margin-top: 0;
## }
## 
## #eaasytoupi .gt_from_md > :last-child {
##   margin-bottom: 0;
## }
## 
## #eaasytoupi .gt_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   margin: 10px;
##   border-top-style: solid;
##   border-top-width: 1px;
##   border-top-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 1px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 1px;
##   border-right-color: #D3D3D3;
##   vertical-align: middle;
##   overflow-x: hidden;
## }
## 
## #eaasytoupi .gt_stub {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-right-style: solid;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #eaasytoupi .gt_stub_row_group {
##   color: #333333;
##   background-color: #FFFFFF;
##   font-size: 100%;
##   font-weight: initial;
##   text-transform: inherit;
##   border-right-style: solid;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
##   padding-left: 5px;
##   padding-right: 5px;
##   vertical-align: top;
## }
## 
## #eaasytoupi .gt_row_group_first td {
##   border-top-width: 2px;
## }
## 
## #eaasytoupi .gt_row_group_first th {
##   border-top-width: 2px;
## }
## 
## #eaasytoupi .gt_summary_row {
##   color: #333333;
##   background-color: #FFFFFF;
##   text-transform: inherit;
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #eaasytoupi .gt_first_summary_row {
##   border-top-style: solid;
##   border-top-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_first_summary_row.thick {
##   border-top-width: 2px;
## }
## 
## #eaasytoupi .gt_last_summary_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_grand_summary_row {
##   color: #333333;
##   background-color: #FFFFFF;
##   text-transform: inherit;
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #eaasytoupi .gt_first_grand_summary_row {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-top-style: double;
##   border-top-width: 6px;
##   border-top-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_last_grand_summary_row_top {
##   padding-top: 8px;
##   padding-bottom: 8px;
##   padding-left: 5px;
##   padding-right: 5px;
##   border-bottom-style: double;
##   border-bottom-width: 6px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_striped {
##   background-color: rgba(128, 128, 128, 0.05);
## }
## 
## #eaasytoupi .gt_table_body {
##   border-top-style: solid;
##   border-top-width: 2px;
##   border-top-color: #D3D3D3;
##   border-bottom-style: solid;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_footnotes {
##   color: #333333;
##   background-color: #FFFFFF;
##   border-bottom-style: none;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_footnote {
##   margin: 0px;
##   font-size: 90%;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #eaasytoupi .gt_sourcenotes {
##   color: #333333;
##   background-color: #FFFFFF;
##   border-bottom-style: none;
##   border-bottom-width: 2px;
##   border-bottom-color: #D3D3D3;
##   border-left-style: none;
##   border-left-width: 2px;
##   border-left-color: #D3D3D3;
##   border-right-style: none;
##   border-right-width: 2px;
##   border-right-color: #D3D3D3;
## }
## 
## #eaasytoupi .gt_sourcenote {
##   font-size: 90%;
##   padding-top: 4px;
##   padding-bottom: 4px;
##   padding-left: 5px;
##   padding-right: 5px;
## }
## 
## #eaasytoupi .gt_left {
##   text-align: left;
## }
## 
## #eaasytoupi .gt_center {
##   text-align: center;
## }
## 
## #eaasytoupi .gt_right {
##   text-align: right;
##   font-variant-numeric: tabular-nums;
## }
## 
## #eaasytoupi .gt_font_normal {
##   font-weight: normal;
## }
## 
## #eaasytoupi .gt_font_bold {
##   font-weight: bold;
## }
## 
## #eaasytoupi .gt_font_italic {
##   font-style: italic;
## }
## 
## #eaasytoupi .gt_super {
##   font-size: 65%;
## }
## 
## #eaasytoupi .gt_footnote_marks {
##   font-size: 75%;
##   vertical-align: 0.4em;
##   position: initial;
## }
## 
## #eaasytoupi .gt_asterisk {
##   font-size: 100%;
##   vertical-align: 0;
## }
## 
## #eaasytoupi .gt_indent_1 {
##   text-indent: 5px;
## }
## 
## #eaasytoupi .gt_indent_2 {
##   text-indent: 10px;
## }
## 
## #eaasytoupi .gt_indent_3 {
##   text-indent: 15px;
## }
## 
## #eaasytoupi .gt_indent_4 {
##   text-indent: 20px;
## }
## 
## #eaasytoupi .gt_indent_5 {
##   text-indent: 25px;
## }
## 
## #eaasytoupi .katex-display {
##   display: inline-flex !important;
##   margin-bottom: 0.75em !important;
## }
## 
## #eaasytoupi div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
##   height: 0px !important;
## }
## </style>
##   <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
##   <thead>
##     <tr class="gt_col_headings">
##       <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="label"><span class='gt_from_md'><strong>Characteristic</strong></span></th>
##       <th class="gt_col_heading gt_columns_bottom_border gt_center" rowspan="1" colspan="1" scope="col" id="stat_0"><span class='gt_from_md'><strong>N = 5,000</strong></span><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span></th>
##     </tr>
##   </thead>
##   <tbody class="gt_table_body">
##     <tr><td headers="label" class="gt_row gt_left">Age (in years)</td>
## <td headers="stat_0" class="gt_row gt_center">60.0 ± 4.5</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Gender</td>
## <td headers="stat_0" class="gt_row gt_center"><br /></td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    Female</td>
## <td headers="stat_0" class="gt_row gt_center">2,573 (51%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    Male</td>
## <td headers="stat_0" class="gt_row gt_center">2,427 (49%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Race/ethnicity</td>
## <td headers="stat_0" class="gt_row gt_center"><br /></td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    White</td>
## <td headers="stat_0" class="gt_row gt_center">3,221 (64%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    Asian</td>
## <td headers="stat_0" class="gt_row gt_center">278 (5.6%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    Black</td>
## <td headers="stat_0" class="gt_row gt_center">1,036 (21%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    Hispanic</td>
## <td headers="stat_0" class="gt_row gt_center">465 (9.3%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Smoking status</td>
## <td headers="stat_0" class="gt_row gt_center"><br /></td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    Never smoked</td>
## <td headers="stat_0" class="gt_row gt_center">3,010 (60%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    Former smoker</td>
## <td headers="stat_0" class="gt_row gt_center">1,504 (30%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">    Current smoker</td>
## <td headers="stat_0" class="gt_row gt_center">486 (9.7%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Height (in centimeters)</td>
## <td headers="stat_0" class="gt_row gt_center">170.1 ± 5.9</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Weight (in kilograms)</td>
## <td headers="stat_0" class="gt_row gt_center">80 ± 7</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">BMI (body mass index)</td>
## <td headers="stat_0" class="gt_row gt_center">27.60 ± 2.76</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Diabetes</td>
## <td headers="stat_0" class="gt_row gt_center">772 (15%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Hypertension</td>
## <td headers="stat_0" class="gt_row gt_center">2,298 (46%)</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Systolic Blood Pressure (mmHg)</td>
## <td headers="stat_0" class="gt_row gt_center">130 ± 8</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">LDL Cholesterol (mg/dL)</td>
## <td headers="stat_0" class="gt_row gt_center">110 ± 20</td></tr>
##     <tr><td headers="label" class="gt_row gt_left">Time since vaccination (in days)</td>
## <td headers="stat_0" class="gt_row gt_center">106 ± 43</td></tr>
##   </tbody>
##   
##   <tfoot class="gt_footnotes">
##     <tr>
##       <td class="gt_footnote" colspan="2"><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span> <span class='gt_from_md'>Median ± SD; n (%)</span></td>
##     </tr>
##   </tfoot>
## </table>
## </div>
##Distribution of Log Antibody Levels by Race/Ethnicity
age_antibody <- ggplot(table_data, aes(x = age, y = log_antibody)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(x = "Age", y = "Average Log Antibody Level", 
       title = "Average Log Antibody Levels by Age") +
  theme_minimal()

age_antibody

#Average Log Antibody Levels by Gender"
gender_antibody <- ggplot(table_data, aes(x = factor(gender), y = log_antibody)) +
  geom_boxplot() +
  labs(x = "Gender", y = "Log Antibody Level") +
  theme_minimal() 

gender_antibody

#Distribution of Log Antibody Levels by Race/Ethnicity
race_antibody <- ggplot(table_data, aes(x = log_antibody, fill = factor(race))) +
  geom_histogram(binwidth = 0.1, position = "identity", alpha = 0.6, color = "black") +
  labs(x = "Log Antibody Level", y = "Count", title = "Distribution of Log Antibody Levels by Race/Ethnicity") +
  scale_fill_manual(values = c("blue", "orange", "green", "purple"), 
                    labels = c("1" = "White", "2" = "Asian", "3" = "Black", "4" = "Hispanic")) +
  theme_minimal() +
  theme(legend.title = element_blank())

race_antibody

#Average Log Antibody Levels by Smoking Status
smoking_antibody <- ggplot(table_data, aes(x = factor(smoking), y = log_antibody)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(x = "Smoking Status", y = "Average Log Antibody Level", 
       title = "Average Log Antibody Levels by Smoking Status") +
  scale_x_discrete(labels = c("0" = "Never Smoked", 
                               "1" = "Former Smoker", 
                               "2" = "Current Smoker")) +
  theme_minimal()

smoking_antibody

#Distribution of Log Antibody Levels by BMI 
bmi_antibody <- ggplot(table_data, aes_string(x = "bmi", y = "log_antibody")) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.5, color = "red", se = FALSE) +
  theme_bw() +
  labs(x = "BMI", y = "Log Antibody Levels")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
bmi_antibody
## `geom_smooth()` using formula = 'y ~ x'

#Average Log Antibody levels by diabetes status
diabetes_antibody <- ggplot(table_data, aes(x = factor(diabetes), y = log_antibody)) +
  geom_boxplot() +
  labs(x = "Diabetes", y = "Log Antibody Level") +
  theme_minimal()

diabetes_antibody

#Average Log Antibody levels by hypertension status
hypertension_antibody <- ggplot(table_data, aes(x = factor(hypertension), y = log_antibody)) +
  geom_boxplot() +
  labs(x = "Hypertension", y = "Log Antibody Level") +
  theme_minimal()

hypertension_antibody

#Distribution of Log Antibody Levels by SBP 
sbp_antibody <- ggplot(table_data, aes_string(x = "SBP", y = "log_antibody")) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.5, color = "red", se = FALSE) +
  theme_bw() +
  labs(x = "SBP", y = "Log Antibody Levels")

sbp_antibody 
## `geom_smooth()` using formula = 'y ~ x'

#Distribution of Log Antibody Levels by LDL
LDL_antibody <- ggplot(table_data, aes_string(x = "LDL", y = "log_antibody")) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.5, color = "red", se = FALSE) +
  theme_bw() +
  labs(x = "LDL", y = "Log Antibody Levels")

LDL_antibody
## `geom_smooth()` using formula = 'y ~ x'

#Distribution of Log Antibody Levels by Time Since Vaccination 
time_antibody <- ggplot(table_data, aes_string(x = "time", y = "log_antibody")) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.5, color = "red", se = FALSE) +
  theme_bw() +
  labs(x = "Time Since Vaccination", y = "Log Antibody Levels")

time_antibody
## `geom_smooth()` using formula = 'y ~ x'

#age & antibody (p <2e-16 for intercept and age)
b <- lm(log_antibody ~ age, data = table_data)
summary(b) 
## 
## Call:
## lm(formula = log_antibody ~ age, data = table_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35812 -0.38225  0.02436  0.41087  1.87772 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.260010   0.111368  101.11   <2e-16 ***
## age         -0.019938   0.001852  -10.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5897 on 4998 degrees of freedom
## Multiple R-squared:  0.02267,    Adjusted R-squared:  0.02247 
## F-statistic: 115.9 on 1 and 4998 DF,  p-value: < 2.2e-16
#gender & antibody (t = 17.523, df = 4972.7, p-value < 2.2e-16)
t.test(log_antibody ~ gender, data = table_data)
## 
##  Welch Two Sample t-test
## 
## data:  log_antibody by gender
## t = 17.523, df = 4972.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  0.2550640 0.3193275
## sample estimates:
## mean in group Female   mean in group Male 
##             10.20375              9.91655
#race & antibody (Pr(>F) = 0.367)
anova_race <- aov(log_antibody ~ race, data = table_data)
summary(anova_race)
##               Df Sum Sq Mean Sq F value Pr(>F)
## race           3    1.1  0.3754   1.055  0.367
## Residuals   4996 1777.5  0.3558
#smoking & antibody (Pr(>F) = 1.95e-10)
anova_smoke <- aov(log_antibody ~ smoking, data = table_data)
summary(anova_smoke)
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## smoking        2   15.8   7.922   22.46 1.95e-10 ***
## Residuals   4997 1762.8   0.353                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# bmi & antibody (p-value: < 2.2e-16)
bmi_linear <- lm(log_antibody ~ bmi, data = table_data)
summary(bmi_linear) 
## 
## Call:
## lm(formula = log_antibody ~ bmi, data = table_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10461 -0.37352  0.02197  0.39745  1.74852 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.41935    0.08307  137.47   <2e-16 ***
## bmi         -0.04885    0.00298  -16.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5811 on 4998 degrees of freedom
## Multiple R-squared:  0.05102,    Adjusted R-squared:  0.05083 
## F-statistic: 268.7 on 1 and 4998 DF,  p-value: < 2.2e-16
#diabetes & antibody p-value = 0.6828
t.test(log_antibody ~ diabetes, data = table_data)
## 
##  Welch Two Sample t-test
## 
## data:  log_antibody by diabetes
## t = -0.40872, df = 1077.9, p-value = 0.6828
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -0.05498657  0.03602815
## sample estimates:
##  mean in group No mean in group Yes 
##          10.06288          10.07236
#hypertension & antibody p-value = 5.491e-05
t.test(log_antibody ~ hypertension, data = table_data) 
## 
##  Welch Two Sample t-test
## 
## data:  log_antibody by hypertension
## t = 4.0373, df = 4891.7, p-value = 5.491e-05
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  0.03505772 0.10124460
## sample estimates:
##  mean in group No mean in group Yes 
##          10.09566          10.02751
#sbp and antibody p-value: 1.451e-05
sbp_linear <- lm(log_antibody ~ SBP, data = table_data)
summary(sbp_linear)  
## 
## Call:
## lm(formula = log_antibody ~ SBP, data = table_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.27107 -0.38295  0.02501  0.40767  1.91119 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.657727   0.136973   77.81  < 2e-16 ***
## SBP         -0.004568   0.001052   -4.34 1.45e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5954 on 4998 degrees of freedom
## Multiple R-squared:  0.003755,   Adjusted R-squared:  0.003556 
## F-statistic: 18.84 on 1 and 4998 DF,  p-value: 1.451e-05
#LDL and antibody p-value: 0.01143
LDL_linear <- lm(log_antibody ~ LDL, data = table_data)
summary(LDL_linear)
## 
## Call:
## lm(formula = log_antibody ~ LDL, data = table_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.27236 -0.38601  0.02812  0.41348  1.85900 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.1807375  0.0467688  217.68   <2e-16 ***
## LDL         -0.0010590  0.0004186   -2.53   0.0114 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5962 on 4998 degrees of freedom
## Multiple R-squared:  0.001279,   Adjusted R-squared:  0.001079 
## F-statistic: 6.402 on 1 and 4998 DF,  p-value: 0.01143
#time and antibody p-value: 0.3278
time_linear <- lm(log_antibody ~ time, data = table_data)
summary(time_linear)
## 
## Call:
## lm(formula = log_antibody ~ time, data = table_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3137 -0.3854  0.0251  0.4127  1.8965 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.0850448  0.0227735 442.842   <2e-16 ***
## time        -0.0001902  0.0001943  -0.979    0.328    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5965 on 4998 degrees of freedom
## Multiple R-squared:  0.0001916,  Adjusted R-squared:  -8.417e-06 
## F-statistic: 0.9579 on 1 and 4998 DF,  p-value: 0.3278
x <- model.matrix(log_antibody ~ ., dat1) [, -1]
y <- dat1$log_antibody

x2 <- model.matrix(log_antibody ~ ., dat2) [, -1]
y2 <- dat2$log_antibody

#cross-validation
ctrl1 = trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 5,
                     selectionFunction = "best")
#GAM 
set.seed(2)
gam.fit <- train(x, y,
method = "gam",
trControl = ctrl1)
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
gam.fit$bestTune
##   select method
## 2   TRUE GCV.Cp
gam.fit$finalModel
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## .outcome ~ gender + race2 + race3 + race4 + smoking1 + smoking2 + 
##     diabetes + hypertension + s(age) + s(SBP) + s(LDL) + s(bmi) + 
##     s(time) + s(height) + s(weight) + s(id)
## 
## Estimated degrees of freedom:
## 0.991 0.000 0.000 4.661 7.846 1.216 0.000 
## 0.000  total = 23.71 
## 
## GCV score: 0.2786709
gam.m1 <- gam(log_antibody ~ time + age + gender + race + smoking + bmi + diabetes + hypertension + 
    SBP + LDL, data = dat1) 

gam.m2 <- gam(log_antibody ~ s(time) + age + gender + race + smoking + bmi + diabetes + hypertension + 
    SBP + LDL, data = dat1) 

anova(gam.m1, gam.m2, test = "F")
## Analysis of Deviance Table
## 
## Model 1: log_antibody ~ time + age + gender + race + smoking + bmi + diabetes + 
##     hypertension + SBP + LDL
## Model 2: log_antibody ~ s(time) + age + gender + race + smoking + bmi + 
##     diabetes + hypertension + SBP + LDL
##   Resid. Df Resid. Dev     Df Deviance      F    Pr(>F)    
## 1    4986.0     1520.6                                     
## 2    4978.5     1407.0 7.4964   113.57 53.614 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(gam.m2)

#Summary of the model
summary(gam.fit)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## .outcome ~ gender + race2 + race3 + race4 + smoking1 + smoking2 + 
##     diabetes + hypertension + s(age) + s(SBP) + s(LDL) + s(bmi) + 
##     s(time) + s(height) + s(weight) + s(id)
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.228204   0.015328 667.284  < 2e-16 ***
## gender       -0.297827   0.014933 -19.944  < 2e-16 ***
## race2        -0.002962   0.033010  -0.090    0.928    
## race3        -0.010567   0.018838  -0.561    0.575    
## race4        -0.037352   0.026175  -1.427    0.154    
## smoking1      0.022213   0.016659   1.333    0.182    
## smoking2     -0.193179   0.025834  -7.478 8.88e-14 ***
## diabetes      0.014216   0.020639   0.689    0.491    
## hypertension -0.007766   0.015995  -0.486    0.627    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df      F p-value    
## s(age)    9.910e-01      9 13.737  <2e-16 ***
## s(SBP)    5.214e-07      9  0.000   0.758    
## s(LDL)    5.607e-07      9  0.000   0.634    
## s(bmi)    4.661e+00      9 42.117  <2e-16 ***
## s(time)   7.846e+00      9 44.897  <2e-16 ***
## s(height) 1.216e+00      9  0.277   0.119    
## s(weight) 1.746e-06      9  0.000   0.612    
## s(id)     5.058e-07      9  0.000   0.731    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.22   Deviance explained = 22.4%
## GCV = 0.27867  Scale est. = 0.27735   n = 5000
#MARS
set.seed(2)
mars_grid <- expand.grid(degree = 1:3,
                         nprune = 2:24) 

mars.fit <- train(x, y,
                  method = "earth",
                  tuneGrid = mars_grid,
                  trControl = ctrl1)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
#pcr using caret
set.seed(2)

pcr_fit = train(x,y,
                method = "pcr",
                tuneGrid = data.frame(ncomp=1:5),
                trControl = ctrl1,
                preProcess = c("center","scale"))

#pls using caret
pls_fit = train(x,y,
                method="pls",
                tunegrid = data.frame(ncomp=1:5),
                trControl = ctrl1,
                preProcess = c("center","scale"))

#enet using caret
enet_fit = train(log_antibody ~.,
                 data = dat1,
                 method = "glmnet",
                 tuneGrid = expand.grid(alpha=seq(0,1, length = 21),
                                        lambda = exp(seq(6,0,length = 100))),
                 trControl = ctrl1)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#lasso using caret
set.seed(2)
lasso.fit = train(log_antibody ~.,
                  data = dat1,
                  method = "glmnet",
                  tuneGrid = expand.grid(alpha=1,
                                         lambda = exp(seq(6,0,length=100))),
                  trControl = ctrl1)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#lm using caret
lm.fit = train(log_antibody ~.,
               data = dat1,
               method = "lm",
               trControl = ctrl1)

#ridge using caret
ridge.fit = train(log_antibody ~.,
                  data = dat1,
                  method = "glmnet",
                  tuneGrid = expand.grid(alpha=1,
                                         lambda = exp(seq(6,0,length=100))),
                  trControl = ctrl1)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#comparing models based on resampling results
resamp = resamples(list(lasso = lasso.fit,
                        lm = lm.fit,
                        ridge = ridge.fit))
bwplot(resamp, metric = "RMSE")

#comparing models based on resampling results
resamp = resamples(list(pcr = pcr_fit,
                        pls = pls_fit,
                        enet = enet_fit,
                        gam = gam.fit,
                        mars = mars.fit,
                        lasso = lasso.fit,
                        lm = lm.fit,
                        ridge = ridge.fit))
bwplot(resamp, metric = "RMSE")

Based on the resampling results, the MARS model is the “best” fit as it has the lowest RMSE.

#creating a piece wise linear model using multivairiate adaptive regression splines (MARS)
ggplot(mars.fit)

mars.fit$bestTune
##   nprune degree
## 8      9      1
coef(mars.fit$finalModel)
##  (Intercept)  h(27.8-bmi)   h(time-57)   h(57-time)       gender    h(age-59) 
## 10.847446930 -0.061997354 -0.002254182 -0.033529326 -0.296290451 -0.022957648 
##    h(59-age)     smoking2  h(bmi-23.7) 
##  0.016138468 -0.205126851 -0.084380175
#understanding the relationship between the coefficients from the final model and lpsa
p1 = pdp::partial(mars.fit, pred.var =c("time"), grid.resolution=10) |> autoplot()
p2 = pdp::partial(mars.fit, pred.var =c("smoking2"), grid.resolution=10) |> autoplot()
p3 = pdp::partial(mars.fit, pred.var =c("gender"), grid.resolution=10) |> autoplot()

p4 = pdp::partial(mars.fit, pred.var = c("age","bmi"), grid.resolution = 10) |>
  pdp::plotPartial(levelplot = FALSE, zlab="yhat", drape=TRUE, screen = list(z=20, x=-60))

gridExtra::grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)

#evaluating final model
predy2_mars2 = predict(mars.fit, newdata=x2)
mean((y2 - predy2_mars2)^2)
## [1] 0.2838458

The MSE for the final model when using the test data is 0.2838458.